import re
import time
import requests
from bs4 import BeautifulSoup
from requests import RequestException

# Site root prepended to the relative chapter hrefs scraped from the index page.
base = "https://www.shubaowo.com"


def get_one_page(url, retries=3):
    """Download *url* and return it parsed as a BeautifulSoup tree.

    The site serves GBK-encoded text that requests may mis-decode, which is
    why the original code round-tripped ``response.text.encode('ISO-8859-1')``;
    passing the raw ``response.content`` bytes lets BeautifulSoup detect the
    real encoding without that lossy detour.

    Args:
        url: Page URL to fetch.
        retries: Number of additional attempts after a failed request
            (non-200 status or a requests exception) before giving up.

    Returns:
        A BeautifulSoup document on success, or ``None`` once all attempts
        are exhausted.  (The original returned ``prettify()`` text, but every
        caller in this file uses the result as a soup object; it also lost
        the retry result by not returning the recursive call.)
    """
    for _attempt in range(retries + 1):
        try:
            response = requests.get(url, timeout=10)
            if response.status_code == 200:
                return BeautifulSoup(response.content, "lxml")
        except RequestException:
            pass  # fall through to the shared retry path below
        print("连接超时，重连中")
        time.sleep(1)
    return None


def parse_one_index(html, base_url="https://www.shubaowo.com"):
    """Extract chapter links from a table-of-contents page.

    Args:
        html: Index-page HTML as a string.
        base_url: Site root prepended to the relative hrefs found inside
            ``<dd>`` entries.  Defaults to the scraped site's root (the
            module-level ``base``), so existing callers are unaffected.

    Yields:
        Dicts of the form ``{'index': absolute_chapter_url}``, one per
        ``<dd><a href="...">`` entry, in document order.
    """
    pattern = re.compile('.*?<dd>.*?<a href="(.*?)">.*?</a>.*?</dd>.*?', re.S)
    for href in pattern.findall(html):
        yield {'index': base_url + href}


def parse_one_page(html):
    """Pull the chapter title and body text out of a chapter page.

    Args:
        html: Chapter-page HTML as a string.

    Yields:
        Dicts with 'title' (heading stripped of newlines and spaces, plus a
        trailing newline) and 'content' (body with ``<br/>`` tags removed
        and runs of four spaces collapsed to two).
    """
    chapter_re = re.compile('<h1>(.*?)</h1>.*?<div id="content">(.*?)</div>', re.S)
    for title_raw, body_raw in chapter_re.findall(html):
        title = title_raw.replace("\n", "").replace(" ", '') + "\n"
        # NOTE: the replacement order matters — collapsing four spaces to two
        # happens before the "\n    \n" removal, exactly as before.
        body = body_raw.replace("<br/>", "")
        body = body.replace("    ", "  ")
        body = body.replace("\n    \n", "")
        yield {'title': title, 'content': body}


def write_to_file(content, name):
    """Append *content* plus a trailing newline to '<name>.txt' (UTF-8)."""
    filename = name + '.txt'
    with open(filename, mode='a', encoding='utf-8') as out:
        out.write(f"{content}\n")


if __name__ == '__main__':
    # The original assigned a hard-coded sample URL and immediately
    # overwrote it with input(); the dead assignment is removed.
    url = input("url:").strip()
    toc = get_one_page(url)  # BeautifulSoup tree of the book's index page
    book_title = toc.find('h1').string
    print(book_title)
    toc_html = toc.prettify()
    # Distinct names per loop level — the original shadowed its iterables
    # ("for index in index", "for page in page"), which works but is fragile
    # and unreadable.
    for entry in parse_one_index(toc_html):
        chapter = get_one_page(entry.get("index"))
        if chapter is None:
            # get_one_page gives up after its retries; skip this chapter
            # instead of crashing on None.prettify().
            continue
        for item in parse_one_page(chapter.prettify()):
            page_title = item.get("title")
            write_to_file(page_title + item.get("content"), book_title)
            print(page_title)
